import os
import random
import itertools
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
sns.set(style="darkgrid")
from IPython import display
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
import datetime
import plotly.io as pio
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
display.Image("./project WorkFlow.png")
display.Image("./Predictions Flow.png")
def classification_report_train_test(y_train, y_train_pred, y_test, y_test_pred):
    print('''
    =========================================
      CLASSIFICATION REPORT FOR TRAIN DATA
    =========================================
    ''')
    print(classification_report(y_train, y_train_pred))
    print('''
    =========================================
      CLASSIFICATION REPORT FOR TEST DATA
    =========================================
    ''')
    print(classification_report(y_test, y_test_pred))
def model_selection(classifier, name, X_train, y_train, X_test, y_test):
    classifier.fit(X_train, y_train)
    pred_train = classifier.predict(X_train)
    pred_test = classifier.predict(X_test)
    results_dict = {}
    results_dict['classifier_name'] = name
    results_dict['Train_Accuracy'] = accuracy_score(y_train, pred_train)
    results_dict['Valid_Accuracy'] = accuracy_score(y_test, pred_test)
    return results_dict
from sklearn.metrics import recall_score,accuracy_score,precision_score
scores = pd.DataFrame(columns=['Model','Train_Accuracy','Test_Accuracy','Train_Recall_macro','Test_Recall_macro','Train_Precision_macro','Test_Precision_macro'])
def get_metrics(train_actual, train_predicted, test_actual, test_predicted, model_description, dataframe):
    train_accuracy = accuracy_score(train_actual, train_predicted)
    test_accuracy = accuracy_score(test_actual, test_predicted)
    train_recall = recall_score(train_actual, train_predicted, average='macro')
    test_recall = recall_score(test_actual, test_predicted, average='macro')
    # use precision_score here (the original mistakenly reused recall_score)
    train_precision = precision_score(train_actual, train_predicted, average='macro')
    test_precision = precision_score(test_actual, test_predicted, average='macro')
    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat instead
    row = pd.DataFrame([[model_description, train_accuracy, test_accuracy, train_recall,
                         test_recall, train_precision, test_precision]],
                       columns=scores.columns)
    dataframe = pd.concat([dataframe, row], ignore_index=True)
    return dataframe
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
customer_data = pd.read_excel('./TrainData/Customerdata_Train.xlsx')
email_data = pd.read_excel('./TrainData/Emaildata_Train.xlsx')
train_data = pd.read_excel('./TrainData/Train.xlsx')
transactions_data = pd.read_excel('./TrainData/transactionsdata_Train.xlsx')
c_data = customer_data.copy()
e_data = email_data.copy()
t_data = train_data.copy()
trans_data = transactions_data.copy()
e_data = pd.get_dummies(data=e_data, columns=['EmailType', 'MailOpened','MailClicked'])
e_data = e_data.groupby('CustomerID').sum()
e_data.head()
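The get_dummies + groupby('CustomerID').sum() pattern above turns each email event row into per-customer counts. A toy sketch (synthetic data, not the project files) of what it produces:
toy = pd.DataFrame({'CustomerID': [1, 1, 2], 'MailOpened': ['yes', 'no', 'yes']})
pd.get_dummies(toy, columns=['MailOpened']).groupby('CustomerID').sum()
# customer 1 -> MailOpened_no=1, MailOpened_yes=1; customer 2 -> MailOpened_no=0, MailOpened_yes=1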
email_data[email_data["EmailType"] =='WelcomeEmail']
email_data["DateOfemail"] =pd.to_datetime(email_data["DateOfemail"])
email_date = email_data.groupby('CustomerID').agg(welcomeEmailDate=('DateOfemail', 'min'), RecentInteraction=('DateOfemail', 'max'))
email_date.head()
email = e_data.merge(email_date, how='inner', on='CustomerID')
email.head()
trans_order = trans_data.groupby('CustomerID').agg(totalOrders=('OrderQuantity', 'sum'), OrderFrequency=('OrderQuantity', np.count_nonzero))
trans_date = trans_data.groupby('CustomerID').agg(firstOrder=('DateOfOrder', 'min'), lastOrder=('DateOfOrder', 'max'))
transactions = trans_order.merge(trans_date, on = 'CustomerID', how= 'inner')
transactions.head()
data = email.merge(transactions, on = 'CustomerID' , how= 'inner')
data.head()
data = customer_data.merge(data, on = 'CustomerID' , how= 'inner')
data.head()
final_data = data.merge(train_data, on = 'CustomerID' , how= 'inner')
final_data.head()
final_data['numberOfDaysWithBusiness'] = final_data['RecentInteraction']-final_data['DateOfRegistration']
final_data['makingBusiness'] = pd.to_datetime(final_data['lastOrder'])- pd.to_datetime(final_data['firstOrder'])
final_data['ordergap'] = pd.to_datetime(final_data['lastOrder']).max() - pd.to_datetime(final_data['lastOrder'])
final_data['ordergap'] = final_data['ordergap'].dt.days
final_data['numberOfDaysWithBusiness'] = final_data['numberOfDaysWithBusiness'].dt.days
final_data['makingBusiness'] = final_data['makingBusiness'].dt.days
final_data.head()
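As a worked example of these engineered features: a customer whose last order falls 10 days before the latest order anywhere in the dataset gets ordergap = 10; makingBusiness counts the days between that customer's first and last orders; and numberOfDaysWithBusiness counts the days from registration to their most recent email interaction.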
def red_data(data):
    return pd.DataFrame({"Data Type": data.dtypes,
                         "No of Unique Data": data.apply(lambda x: x.nunique(), axis=0),
                         "Levels": data.apply(lambda x: str(x.unique()), axis=0),
                         "Null_values": data.isnull().sum(),
                         # express nulls as an actual percentage of rows (the original divided by 100)
                         'null%': data.isnull().sum() / len(data) * 100,
                         'skewness': data.skew(numeric_only=True),
                         'kurtosis': data.kurt(numeric_only=True)})
red_data(final_data)
date_viz = final_data.copy()
final_data = final_data[['CustomerID', 'City', 'OnlineCommunication',
'AutomaticRefill', 'DoorstepDelivery', 'PreferredDeliveryDay',
'EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no',
'MailClicked_yes',
'totalOrders', 'OrderFrequency',
'RetentionPotential', 'numberOfDaysWithBusiness', 'makingBusiness','ordergap']]
final_data.head()
final_data.describe()
sns.set(style="white")
corr = final_data.corr(numeric_only=True)  # numeric_only avoids errors from the remaining non-numeric columns in recent pandas
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
final_data.plot(kind='box',layout=(2,7),subplots=1,figsize=(20,16))
colors = ['Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r']
columns = ['City','OnlineCommunication','AutomaticRefill','DoorstepDelivery','PreferredDeliveryDay','RetentionPotential']
for col in columns:
    sns.countplot(x=col, data=final_data, palette=random.choice(colors))
    plt.title('Number Of {}'.format(col))
    plt.show()
    #plt.title('Comparison Between the {},{} & Customer {}'.format(x,col,'RetentionPotential'))
columns = ['OnlineCommunication','AutomaticRefill','DoorstepDelivery','PreferredDeliveryDay']
for i in columns:
    g = sns.catplot(x="City", hue="RetentionPotential", col=i,
                    data=final_data, kind="count",
                    height=4, aspect=.7, palette=random.choice(colors))
    g.fig.subplots_adjust(top=0.8)
    g.fig.suptitle('Comparison Between the City, {} & Customer RetentionPotential'.format(i))
final_data.head()
final_data[final_data['totalOrders'] >50000 ]
date_viz['firstOrder']= pd.to_datetime(date_viz['firstOrder'])
#df['Ship Date']= pd.to_datetime(df['Ship Date'])
daily_quantity = date_viz.groupby('firstOrder')['OrderFrequency'].count().reset_index()
trace0 = go.Scatter(x=daily_quantity['firstOrder'], y=daily_quantity['OrderFrequency'], name='orders')  # y must come from the grouped frame, not the raw one
fig0 = go.Figure([trace0])
fig0.update_layout(
title={
'text': "Total OrderFrequency by Day",
'x':0.5,
'xanchor': 'center'})
fig0.show()
display.Image("./images/total_orderfrequency_vs_day.png")
date_viz['lastOrder']= pd.to_datetime(date_viz['lastOrder'])
#df['Ship Date']= pd.to_datetime(df['Ship Date'])
daily_quantity = date_viz.groupby('lastOrder')['totalOrders'].sum().reset_index()  # sum, to match the "Total" in the title
trace0 = go.Scatter(x=daily_quantity['lastOrder'], y=daily_quantity['totalOrders'], name='orders')
fig0 = go.Figure([trace0])
fig0.update_layout(
title={
'text': "Total totalOrders by Day",
'x':0.5,
'xanchor': 'center'})
fig0.show()
display.Image("./images/totalorder_vs_day.png")
date_viz['year'] = date_viz['RecentInteraction'].dt.year
date_viz['month'] = date_viz['RecentInteraction'].dt.month
date_viz['dow'] = date_viz['RecentInteraction'].dt.dayofweek
date_viz['day'] = date_viz['RecentInteraction'].dt.day
trace0 = go.Scatter(x=date_viz.groupby('year')['totalOrders'].count().index, y=date_viz.groupby('year')['totalOrders'].count().values)
trace1 = go.Scatter(x=date_viz.groupby('month')['totalOrders'].count().index, y=date_viz.groupby('month')['totalOrders'].count().values)
trace2 = go.Scatter(x=date_viz.groupby('dow')['totalOrders'].count().index, y=date_viz.groupby('dow')['totalOrders'].count().values)
trace3 = go.Scatter(x=date_viz.groupby('day')['totalOrders'].count().index, y=date_viz.groupby('day')['totalOrders'].count().values)
fig1 = make_subplots(rows=4, cols=1, subplot_titles=('Total totalOrders by Year', 'Total totalOrders by Month', 'Total totalOrders by Day of Week', 'Total totalOrders by Day of Month'))
fig1.append_trace(trace0, 1,1)
fig1.append_trace(trace1, 2,1)
fig1.append_trace(trace2, 3,1)
fig1.append_trace(trace3, 4,1)
fig1['layout'].update(title='', showlegend=False)
fig1
display.Image("./images/totalorders.png")
Yearly: total orders peaked around 2012 and had fallen again by 2014.
Monthly: orders rise in the last two quarters (August to December), when most festivals fall, and again in January around the new year.
Weekly: sales are highest on the weekends and taper off gradually through the weekdays.
Daily: salaries are credited in the first week of the month, so sales climb through mid-month and then drop sharply toward month end.
def gen_scatter(region, col):
    return go.Scatter(
        x=date_viz[date_viz.City == region].groupby('month')[col].count().index,
        y=date_viz[date_viz.City == region].groupby('month')[col].count().values,
        name=region,
        mode='markers'
    )
data = [
gen_scatter('CITY1', 'totalOrders'),
gen_scatter('CITY2', 'totalOrders'),
gen_scatter('CITY3', 'totalOrders'),
gen_scatter('CITY4', 'totalOrders')
]
layout = go.Layout(
title={
'text': "Total totalOrders by City",
'x':0.5,
'xanchor': 'center'}, xaxis = dict(title = 'Month')
)
fig4 = go.Figure(data=data, layout=layout)
fig4
display.Image("./images/city_vs_total_orders.png")
g = sns.relplot(x="PreferredDeliveryDay", y="totalOrders", hue="RetentionPotential", size="MailOpened_yes",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('PreferredDeliveryDay vs totalOrders',size=24)
g = sns.relplot(x="EmailType_CustomizedEmail", y="totalOrders", hue="RetentionPotential", size="MailOpened_yes",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('EmailType_CustomizedEmail vs totalOrders',size=24)
final_data[final_data['EmailType_CustomizedEmail'] >250 ]
g = sns.relplot(x="EmailType_CustomizedEmail", y="MailClicked_yes", hue="RetentionPotential", size="MailOpened_yes",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('EmailType_CustomizedEmail vs MailClicked_yes',size=24)
g = sns.relplot(x="OrderFrequency", y="totalOrders", hue="RetentionPotential", size="EmailType_CustomizedEmail",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('OrderFrequency vs TotalOrders',size=24)
g = sns.relplot(x="MailOpened_yes", y="OrderFrequency", hue="RetentionPotential", size="MailClicked_yes",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('MailOpened_yes vs OrderFrequency',size=24)
g = sns.relplot(x="MailOpened_yes", y="totalOrders", hue="RetentionPotential", size="MailClicked_yes",
sizes=(40, 400), alpha=.5, palette=random.choice(colors),
height=6, data=final_data)
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('MailOpened_yes vs totalOrders',size=24)
columns = ['PreferredDeliveryDay','OnlineCommunication','AutomaticRefill','DoorstepDelivery']
for cols in columns:
    col_order = np.sort(final_data[cols].unique()).tolist()
    plt.figure(figsize=(10,5))
    sns.stripplot(x=cols, y='totalOrders', data=final_data, order=col_order, hue='RetentionPotential', palette=random.choice(colors))
    plt.xlabel(cols, fontsize=12)
    plt.ylabel('TotalOrders', fontsize=12)
    plt.title("Distribution of TotalOrders variable with "+cols, fontsize=15)
    plt.show()
columns = ['PreferredDeliveryDay','OnlineCommunication','AutomaticRefill','DoorstepDelivery']
for cols in columns:
    col_order = np.sort(final_data[cols].unique()).tolist()
    plt.figure(figsize=(10,5))
    sns.stripplot(x=cols, y='OrderFrequency', data=final_data, order=col_order, hue='RetentionPotential', palette=random.choice(colors))
    plt.xlabel(cols, fontsize=12)
    plt.ylabel('OrderFrequency', fontsize=12)
    plt.title("Distribution of OrderFrequency variable with "+cols, fontsize=15)
    plt.show()
df = final_data.drop('CustomerID',axis=1).copy()
df.head()
from sklearn.model_selection import train_test_split
y = df['RetentionPotential']
X = df.loc[:, df.columns != 'RetentionPotential']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=111,stratify = y)
print(X_train.shape, X_test.shape)
y_train = y_train.map(dict(High=2, Low=0,Medium=1))
y_test = y_test.map(dict(High=2, Low=0,Medium=1))
print(y_train.shape, y_test.shape)
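stratify=y keeps the Low/Medium/High proportions identical in the train and test splits, which matters here because the classes are heavily imbalanced (see the class-proportion dictionary used for weighting further below).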
num_cols = ["EmailType_CustomizedEmail", 'MailOpened_no', 'MailOpened_yes','MailClicked_no','MailClicked_yes','totalOrders','OrderFrequency','numberOfDaysWithBusiness','makingBusiness','ordergap']
cat_cols =['City', 'PreferredDeliveryDay']
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
X_train = pd.DataFrame(X_train)
X_test= pd.DataFrame(X_test)
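StandardScaler rescales each numeric column to z = (x − μ) / σ, with μ and σ computed on the training split only; reusing those train statistics on the test split (scaler.transform above) prevents information from the test set leaking into preprocessing.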
from sklearn.preprocessing import LabelEncoder
# keep one fitted encoder per column, so the hold-out test files can later be encoded
# with the same mappings (a single reused encoder would only remember the last column it was fit on)
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    encoders[col] = le
models = [{'name': 'logreg','label': 'Logistic Regression',
'classifier': LogisticRegression(random_state=88),
},
{'name': 'knn','label':'K Nearest Neighbors',
'classifier':KNeighborsClassifier(),
},
{'name': 'dsc','label': 'Decision Tree',
'classifier': DecisionTreeClassifier(random_state=88),
},
{'name': 'rf', 'label': 'Random Forest',
'classifier': RandomForestClassifier(random_state=88),
},
{'name': 'svc', 'label': 'SVM',
'classifier': SVC(class_weight='balanced',random_state=88),
},
{'name': 'Boosting', 'label': 'Xgboost',
'classifier': xgb.XGBClassifier(objective='multi:softmax', random_state=88),  # XGBClassifier has no class_weight parameter; imbalance is handled with sample weights later
}
]
results_base = []
for m in models:
    print(m['name'])
    results_base.append(model_selection(m['classifier'],
                                        m['name'],
                                        X_train,
                                        y_train, X_test, y_test))
    print('completed')
results_base_models = pd.DataFrame(results_base).sort_values(by='Valid_Accuracy', ascending = False)
results_base_models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}  # l1 = lasso, l2 = ridge
logreg = LogisticRegression(solver='liblinear')  # liblinear supports both the l1 and l2 penalties searched here; the default lbfgs solver rejects l1
logreg_cv = GridSearchCV(logreg, grid, cv=10)
logreg_cv.fit(X_train,y_train)
print("tuned hyperparameters (best parameters):", logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)
logreg_cv.best_params_
log_predict_train=logreg_cv.best_estimator_.predict(X_train)
log_predict_test=logreg_cv.best_estimator_.predict(X_test)
classification_report_train_test(y_train, log_predict_train, y_test, log_predict_test)
scores = get_metrics(y_train,log_predict_train,y_test,log_predict_test,'Logistic_GridSearchCV',scores)
dt_model = DecisionTreeClassifier(class_weight = 'balanced',random_state=88)
grid = {
"max_depth":np.arange(14)+1,
'criterion':['gini','entropy']
}
grid_cv_dt = GridSearchCV(estimator=dt_model, param_grid=grid, cv=3,verbose=1)
grid_cv_dt.fit(X_train, y_train)
grid_cv_dt.best_estimator_
grid_cv_dt.best_params_
dt_predict_train=grid_cv_dt.best_estimator_.predict(X_train)
dt_predict_test=grid_cv_dt.best_estimator_.predict(X_test)
classification_report_train_test(y_train, dt_predict_train, y_test, dt_predict_test)
#scores = get_metrics(y_train,dt_predict_train,y_test,dt_predict_test,'DecisionTrees_GridSearchCV',scores)
import pickle
Pkl_Filename = "Models/DT.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(grid_cv_dt.best_estimator_, file)
# with open(Pkl_Filename, 'rb') as file:
# Pickled_LR_Model = pickle.load(file)
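A quick sanity check that the pickle round-trips cleanly (a minimal sketch):
with open(Pkl_Filename, 'rb') as file:
    restored_dt = pickle.load(file)
# the restored model should reproduce the in-memory model's predictions exactly
assert (restored_dt.predict(X_test) == dt_predict_test).all()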
from sklearn import tree
clf = DecisionTreeClassifier(max_depth = 2)
clf.fit(X_train, y_train)
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (3,3), dpi=300)
tree.plot_tree(clf,
               # CustomerID was dropped before the split, so it must not appear in feature_names
               feature_names=['City', 'OnlineCommunication', 'AutomaticRefill',
                              'DoorstepDelivery', 'PreferredDeliveryDay', 'EmailType_CustomizedEmail',
                              'MailOpened_no', 'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes',
                              'totalOrders', 'OrderFrequency',
                              'numberOfDaysWithBusiness', 'makingBusiness', 'ordergap'],
               class_names=['Low', 'Medium', 'High'],
               filled=True, rounded=True);
cnf_matrix = metrics.confusion_matrix(y_test, dt_predict_test)
np.set_printoptions(precision=2)
class_names = ['Low', 'Medium','High']
plt.figure(figsize=(10,5))
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='Confusion matrix DT')
from sklearn.svm import SVC
svc_grid = SVC(class_weight='balanced')
param_grid = {
'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.01, 0.1, 1],
'kernel':['linear', 'rbf']}
svc_cv_grid = GridSearchCV(estimator = svc_grid, param_grid = param_grid, cv = 3,verbose=1)
svc_cv_grid.fit(X_train, y_train)
svc_cv_grid.best_estimator_
#predicting using best_estimator
svc_train_pred = svc_cv_grid.best_estimator_.predict(X_train)
svc_test_pred = svc_cv_grid.best_estimator_.predict(X_test)
classification_report_train_test(y_train, svc_train_pred, y_test, svc_test_pred)
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
    classes = np.unique(y_train, axis=0)
    classes.sort()
    class_samples = np.bincount(y_train)
    total_samples = class_samples.sum()
    n_classes = len(class_samples)
    # inverse-frequency weights: rare classes receive proportionally larger weights
    weights = total_samples / (n_classes * class_samples * 1.0)
    class_weight_dict = {key: value for (key, value) in zip(classes, weights)}
    # damp the weight of the class at index 1 by the supplied coefficient
    class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
    sample_weights = [class_weight_dict[y] for y in y_train]
    return sample_weights
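This helper is defined but never called below (compute_sample_weight is used instead); a minimal usage sketch, with a hypothetical damping coefficient:
# hypothetical call: inverse-frequency per-sample weights, with the class at index 1 damped by 0.8
example_weights = CreateBalancedSampleWeights(y_train, largest_class_weight_coef=0.8)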
import math
weights = {}
def create_class_weight(labels_dict, mu=0.15):
    total = np.sum(list(labels_dict.values()))
    keys = labels_dict.keys()
    class_weight = dict()
    for key in keys:
        score = math.log(mu * total / float(labels_dict[key]))
        class_weight[key] = score if score > 1.0 else 1.0
    return class_weight
labels_dict = {0: 0.8054156171284634, 1: 0.03689126784214945, 2: 0.15769311502938707}
weights.update(create_class_weight(labels_dict))
weights
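Working the formula through these proportions: total ≈ 1.0, so mu·total = 0.15; class 1 (the rarest) gets ln(0.15 / 0.0369) ≈ 1.40, while classes 0 and 2 yield ln(0.186) and ln(0.951), both below the floor and therefore clipped to 1.0.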
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight,compute_sample_weight
rfc=RandomForestClassifier(random_state=42)
param_grid = {
'n_estimators': [50,100,150,200],
'max_features': ['sqrt', 'log2'],  # 'auto' was removed for random forests in recent scikit-learn versions
'max_depth' : np.arange(14)+1,
'criterion' :['gini', 'entropy']
}
sample_weights=compute_sample_weight(weights,y_train)
grid_cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3,verbose=1,scoring='recall_weighted')
grid_cv_rfc.fit(X_train, y_train,sample_weight=sample_weights)
grid_cv_rfc.best_estimator_
grid_cv_rfc.best_params_
rf_predict_train=grid_cv_rfc.best_estimator_.predict(X_train)
rf_predict_test=grid_cv_rfc.best_estimator_.predict(X_test)
classification_report_train_test(y_train, rf_predict_train, y_test, rf_predict_test)
scores = get_metrics(y_train,rf_predict_train,y_test,rf_predict_test,'RandomForest_GridSearchCV',scores)
Pkl_Filename = "Models/rf.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(grid_cv_rfc.best_estimator_, file)
import xgboost as xgb
random_state = np.random.randint(0, 1000)  # note: a random seed makes these XGBoost runs non-reproducible
parameters = {
'max_depth': np.arange(15)+1,
'learning_rate': [0.01,0.05,0.1],
'n_estimators': [50, 100, 150],
'gamma': [0.1],
'subsample': [0.7],
}
xgb_model = xgb.XGBClassifier(objective='multi:softmax',random_state = random_state)
sample_weights=compute_sample_weight(weights,y_train)
grid_cv_xg = GridSearchCV(xgb_model, param_grid=parameters,scoring='recall_weighted',cv=3, verbose=1)
grid_cv_xg.fit(X_train,y_train,sample_weight=sample_weights)
grid_cv_xg.best_estimator_
grid_cv_xg.best_params_
xg_predict_train=grid_cv_xg.best_estimator_.predict(X_train)
xg_predict_test=grid_cv_xg.best_estimator_.predict(X_test)
classification_report_train_test(y_train, xg_predict_train, y_test, xg_predict_test)
#scores = get_metrics(y_train,xg_predict_train,y_test,xg_predict_test,'XGBoost_GridSearchCV',scores)
from mlxtend.plotting import plot_learning_curves
plot_learning_curves(X_train, y_train, X_test, y_test, grid_cv_xg.best_estimator_)
plt.show()
grid_cv_xg.best_estimator_.feature_importances_
feat_importances = pd.Series(grid_cv_xg.best_estimator_.feature_importances_, index = ['City', 'OnlineCommunication', 'AutomaticRefill',
'DoorstepDelivery', 'PreferredDeliveryDay', 'EmailType_CustomizedEmail',
'MailOpened_no', 'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes',
'totalOrders', 'OrderFrequency',
'numberOfDaysWithBusiness', 'makingBusiness', 'ordergap'])
feat_importances_ordered = feat_importances.nlargest(n=10)
feat_importances_ordered
## Plot Importance
%matplotlib inline
feat_importances_ordered.plot(kind='barh')
plt.show()
### Adding More parameters
param_test2 = {
'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}
xgb_model = grid_cv_xg.best_estimator_
sample_weights=compute_sample_weight(weights,y_train)
grid_cv_xg_1 = GridSearchCV(xgb_model, param_grid=param_test2,scoring='recall_weighted',cv=3, verbose=1)
grid_cv_xg_1.fit(X_train,y_train,sample_weight=sample_weights)
grid_cv_xg_1.best_params_
grid_cv_xg_1.best_estimator_
xg_predict_train_1=grid_cv_xg_1.best_estimator_.predict(X_train)
xg_predict_test_1=grid_cv_xg_1.best_estimator_.predict(X_test)
classification_report_train_test(y_train, xg_predict_train_1, y_test, xg_predict_test_1)
scores = get_metrics(y_train,xg_predict_train_1,y_test,xg_predict_test_1,'XGBoost_GridSearchCV',scores)
cnf_matrix = metrics.confusion_matrix(y_test, xg_predict_test_1)
np.set_printoptions(precision=2)
class_names = ['Low', 'Medium','High']
plt.figure(figsize=(10,5))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix XGBoost')
Pkl_Filename = "Models/xg.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(grid_cv_xg_1.best_estimator_, file)
import lightgbm as lgb
parms = {
'max_depth':np.arange(15)+1,
'n_estimators':[30,50,70,100,150],
'learning_rate':[0.01,0.1,1,10],
'subsample':[0.7]
}
clf = lgb.LGBMClassifier()
CV_lgbm = GridSearchCV(estimator = clf,param_grid = parms,cv=3,verbose=1,scoring='recall_weighted')
CV_lgbm.fit(X_train,y_train,sample_weight=sample_weights)
CV_lgbm.best_params_
CV_lgbm.best_estimator_
lg_predict_train_1=CV_lgbm.best_estimator_.predict(X_train)
lg_predict_test_1=CV_lgbm.best_estimator_.predict(X_test)
classification_report_train_test(y_train, lg_predict_train_1, y_test, lg_predict_test_1)
#scores = get_metrics(y_train,lg_predict_train_1,y_test,lg_predict_test_1,'LGBM_GridSearchCV',scores)
params = {
'reg_alpha': [0, 1e-1, 1, 2],
'reg_lambda': [0, 1e-1, 1, 5],
'objective': ['multiclass'],
'num_class':[3]
}
clf_1 = CV_lgbm.best_estimator_
CV_lgbm_1 = GridSearchCV(estimator = clf_1,param_grid = params,cv=3,verbose=1,scoring='recall_weighted')
CV_lgbm_1.fit(X_train,y_train,sample_weight=sample_weights)
CV_lgbm_1.best_estimator_
lg_predict_train_1=CV_lgbm_1.best_estimator_.predict(X_train)
lg_predict_test_1=CV_lgbm_1.best_estimator_.predict(X_test)
classification_report_train_test(y_train, lg_predict_train_1, y_test, lg_predict_test_1)
#scores = get_metrics(y_train,lg_predict_train_1,y_test,lg_predict_test_1,'LGBM_GridSearchCV',scores)
from mlxtend.classifier import StackingClassifier
clf1 = grid_cv_dt.best_estimator_
clf2 = grid_cv_xg.best_estimator_
clf3 = grid_cv_xg_1.best_estimator_
lgbm = CV_lgbm_1.best_estimator_
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
meta_classifier=lgbm)
params = {'decisiontreeclassifier__max_depth': [8],
'meta_classifier__learning_rate': [0.1],
'meta_classifier__max_depth': [8],
'meta_classifier__n_estimators': [50],
}
grid = GridSearchCV(estimator=sclf,
param_grid=params,
cv=10,
scoring='recall_weighted',verbose=1)
grid.fit(X_train, y_train)
cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))
print('Best parameters: %s' % grid.best_params_)
print('Best recall_weighted score: %.2f' % grid.best_score_)
grid.best_estimator_
grid.best_params_
stack_predict_train_1=grid.best_estimator_.predict(X_train)
stack_predict_test_1=grid.best_estimator_.predict(X_test)
classification_report_train_test(y_train, stack_predict_train_1, y_test, stack_predict_test_1)
scores = get_metrics(y_train,stack_predict_train_1,y_test,stack_predict_test_1,'Stacked_GridSearchCV',scores)
scores
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.utils import to_categorical
nclasses = len(np.unique(y_train))
nfeatures = np.size(X_train, axis=1)
target_labels_keras = to_categorical(y_train)
label = y_test.copy()
label = to_categorical(label)
# keras model
model = Sequential()
model.add(Dense(nfeatures, activation='relu', kernel_initializer='he_normal', input_shape=(nfeatures,)))
model.add(Dense(150, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(50, activation='relu', kernel_initializer='he_normal'))
model.add(Dropout(0.2))
model.add(Dense(20, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(15, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(nclasses, activation='softmax', kernel_initializer='he_normal'))
# compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
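# note: with one-hot labels from to_categorical above, categorical_crossentropy is the matching loss;
# integer labels would instead pair with sparse_categorical_crossentropy and skip the encoding step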
#fit model
model.fit(X_train, target_labels_keras, batch_size=24,
shuffle=True,
validation_data=(X_test, label), epochs=30,class_weight=weights)
# the predicted class is simply the argmax over the softmax probabilities
# (the original argsort/slice/argmax chain returned positions within the sorted index array, not class labels)
y_pred_train = np.argmax(model.predict(X_train), axis=-1)
y_pred_test = np.argmax(model.predict(X_test), axis=-1)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred_test)
np.set_printoptions(precision=2)
class_names = ['Low', 'Medium','High']
plt.figure(figsize=(10,5))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix Keras NN')
df_trans = final_data.drop('CustomerID',axis=1)
df_trans.head()
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(output_distribution='normal')
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer() ## default yeo_jhonson
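To see what the quantile transform does, a toy sketch on skewed synthetic data (not the project data):
rng = np.random.default_rng(0)
skewed = rng.lognormal(size=(1000, 1))                          # heavily right-skewed sample
demo_qt = QuantileTransformer(output_distribution='normal', n_quantiles=100)
print(pd.Series(skewed.ravel()).skew())                         # large positive skew before
print(pd.Series(demo_qt.fit_transform(skewed).ravel()).skew())  # roughly 0 after mapping to a normal shape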
df_trans['RetentionPotential'] = df_trans['RetentionPotential'].map(dict(High=2, Low=0,Medium=1))
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
cat_cols =['City', 'PreferredDeliveryDay']
for col in cat_cols:
    df_trans[col] = le.fit_transform(df_trans[col])  # fit_transform alone; the separate fit call was redundant
y_scale = df_trans['RetentionPotential']
X_scale = df_trans.loc[:, df_trans.columns != 'RetentionPotential']
X_train_trans, X_test_trans, y_train_trans, y_test_trans = train_test_split(X_scale, y_scale, test_size=0.20, random_state=111, stratify=y_scale)
print(X_train_trans.shape, X_test_trans.shape)
num_cols = ["EmailType_CustomizedEmail", 'MailOpened_no', 'MailOpened_yes','MailClicked_no','MailClicked_yes','totalOrders','OrderFrequency','numberOfDaysWithBusiness','makingBusiness','ordergap']
X_train_trans[num_cols] = qt.fit_transform(X_train_trans[num_cols])
X_test_trans[num_cols] = qt.transform(X_test_trans[num_cols])  # transform only: reuse the train quantiles to avoid fitting on the test split
X_train_trans.head()
sns.set(style="whitegrid")
df_explore = X_train_trans[num_cols]
for cat in df_explore:
    cat_num = df_explore[cat]
    print("Plot for %s: total counts = %d" % (cat.upper(), len(cat_num)))
    f, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(cat_num, kde=True, ax=ax)  # histplot replaces the deprecated distplot
    plt.xticks(rotation=45)
    plt.show()
X_train_trans.plot(kind='box',layout=(2,8),subplots=1,figsize=(20,16))
results_base_trans = []
for m in models:
    print(m['name'])
    results_base_trans.append(model_selection(m['classifier'],
                                              m['name'],
                                              X_train_trans,
                                              y_train_trans, X_test_trans, y_test_trans))
    print('completed')
results_base_models_trans = pd.DataFrame(results_base_trans).sort_values(by='Valid_Accuracy', ascending = False)
results_base_models_trans
parms = {
'max_depth':np.arange(15)+1,
'n_estimators':[50,100,150,200],
'learning_rate':[0.01,0.1,1,10],
'subsample':[0.7]
}
clf = lgb.LGBMClassifier()
CV_lgbm_trans = GridSearchCV(estimator = clf,param_grid = parms,cv=3,verbose=1,scoring='recall_weighted')
CV_lgbm_trans.fit(X_train_trans,y_train_trans,sample_weight=sample_weights)
CV_lgbm_trans.best_params_
lg_predict_train_trans=CV_lgbm_trans.best_estimator_.predict(X_train_trans)
lg_predict_test_trans=CV_lgbm_trans.best_estimator_.predict(X_test_trans)
classification_report_train_test(y_train_trans, lg_predict_train_trans, y_test_trans, lg_predict_test_trans)
scores = get_metrics(y_train_trans,lg_predict_train_trans,y_test_trans,lg_predict_test_trans,'LGBM_GridSearchCV_QTransformation_applied',scores)
parameters = {
'max_depth': np.arange(15)+1,
'learning_rate': [0.01,0.05,0.1],
'n_estimators': [50, 100, 150],
'gamma': [0.1],
'subsample': [0.7],
}
xgb_model = xgb.XGBClassifier(objective='multi:softmax',random_state = random_state)
grid_cv_xg_trans = GridSearchCV(xgb_model, param_grid=parameters,scoring='recall_weighted',cv=3, verbose=1)
grid_cv_xg_trans.fit(X_train_trans,y_train_trans,sample_weight=sample_weights)
grid_cv_xg_trans.best_params_
xg_predict_train_trans=grid_cv_xg_trans.best_estimator_.predict(X_train_trans)
xg_predict_test_trans=grid_cv_xg_trans.best_estimator_.predict(X_test_trans)
classification_report_train_test(y_train_trans, xg_predict_train_trans, y_test_trans, xg_predict_test_trans)
#scores = get_metrics(y_train,lg_predict_train_trans,y_test,lg_predict_test_trans,'LGBM_GridSearchCV_QTransformation_applied',scores)
grid_cv_xg_trans.best_estimator_
param_test2 = {
'n_estimators': [130,150,200],
'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}
xgb_best = grid_cv_xg_trans.best_estimator_  # renamed so the xgboost module alias `xgb` is not shadowed
grid_cv_xg_trans = GridSearchCV(xgb_best, param_grid=param_test2, scoring='recall_weighted', cv=3, verbose=1)
grid_cv_xg_trans.fit(X_train_trans, y_train_trans, sample_weight=sample_weights)
xg_predict_train_trans=grid_cv_xg_trans.best_estimator_.predict(X_train_trans)
xg_predict_test_trans=grid_cv_xg_trans.best_estimator_.predict(X_test_trans)
classification_report_train_test(y_train_trans, xg_predict_train_trans, y_test_trans, xg_predict_test_trans)
scores = get_metrics(y_train_trans,xg_predict_train_trans,y_test_trans,xg_predict_test_trans,'XGBoost_GridSearchCV_QTransformation_applied',scores)
scores
customer_data_test = pd.read_excel('TestData/Customerdata_Test.xlsx')
email_data_test = pd.read_excel('TestData/Emaildata_Test.xlsx')
train_data_test = pd.read_excel('TestData/Test.xlsx')
transactions_data_test = pd.read_excel('TestData/transactionsdata_Test.xlsx')
c_data_test = customer_data_test.copy()
e_data_test = email_data_test.copy()
t_data_test = train_data_test.copy()
trans_data_test = transactions_data_test.copy()
e_data_test = pd.get_dummies(data=e_data_test, columns=['EmailType', 'MailOpened','MailClicked'])
e_data_test = e_data_test.groupby('CustomerID').sum()
email_data_test["DateOfemail"] =pd.to_datetime(email_data_test["DateOfemail"])
email_data_test = email_data_test.groupby('CustomerID').agg(welcomeEmailDate=('DateOfemail', 'min'), RecentInteraction=('DateOfemail', 'max'))
email_test = e_data_test.merge(email_data_test, how='inner', on='CustomerID')
email_test.head()
trans_order_test = trans_data_test.groupby('CustomerID').agg(totalOrders=('OrderQuantity', 'sum'), OrderFrequency=('OrderQuantity', np.count_nonzero))
trans_date_test = trans_data_test.groupby('CustomerID').agg(firstOrder=('DateOfOrder', 'min'), lastOrder=('DateOfOrder', 'max'))
transactions_test = trans_order_test.merge(trans_date_test, on = 'CustomerID', how= 'inner')
transactions_test.head()
data_test = email_test.merge(transactions_test, on = 'CustomerID' , how= 'inner')
data_test.head()
data_test = customer_data_test.merge(data_test, on = 'CustomerID' , how= 'inner')
data_test.head()
final_data_test = data_test.merge(train_data_test, on = 'CustomerID' , how= 'inner')
final_data_test.head()
final_data_test['numberOfDaysWithBusiness'] = final_data_test['RecentInteraction']-final_data_test['DateOfRegistration']
final_data_test['makingBusiness'] = pd.to_datetime(final_data_test['lastOrder'])- pd.to_datetime(final_data_test['firstOrder'])
final_data_test['ordergap'] = pd.to_datetime(final_data_test['lastOrder']).max() - pd.to_datetime(final_data_test['lastOrder'])
final_data_test['ordergap'] = final_data_test['ordergap'].dt.days
final_data_test['numberOfDaysWithBusiness'] = final_data_test['numberOfDaysWithBusiness'].dt.days
final_data_test['makingBusiness'] = final_data_test['makingBusiness'].dt.days
final_data_test = final_data_test[['CustomerID', 'City', 'OnlineCommunication',
'AutomaticRefill', 'DoorstepDelivery', 'PreferredDeliveryDay',
'EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no',
'MailClicked_yes',
'totalOrders', 'OrderFrequency',
'numberOfDaysWithBusiness', 'makingBusiness','ordergap']]
final_data_test.head()
test_id = final_data_test['CustomerID']
final_data_test.drop('CustomerID',axis=1,inplace=True)
final_data_test[num_cols] = scaler.transform(final_data_test[num_cols])
# use the per-column encoders saved during training; a single reused encoder would only hold the last column's mapping
final_data_test['PreferredDeliveryDay'] = encoders['PreferredDeliveryDay'].transform(final_data_test['PreferredDeliveryDay'])
final_data_test['City'] = encoders['City'].transform(final_data_test['City'])
test = final_data_test.copy()
test.head()
pred = grid_cv_dt.best_estimator_.predict(test)
pred
sub = pd.read_csv('submissions/pycaret_first_submission.csv')
sub
sub.drop('RetentionPotential',axis=1,inplace=True)
sub['RetentionPotential'] = pred
sub.RetentionPotential.value_counts()
sub.to_csv('dt_submission.csv',index=False)
from scipy import stats
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
from sklearn import metrics
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
cluster_data_exp = final_data.drop(['CustomerID'],axis=1).copy()
cluster_data_1 = final_data.drop(['CustomerID'],axis=1).copy()
cluster_data = final_data.drop(['CustomerID','RetentionPotential'],axis=1).copy()  # the view actually fed to KMeans; it was missing in the original
cluster_data.head()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
cat_cols = ['City','PreferredDeliveryDay']
for col in cat_cols:
    le.fit(cluster_data[col])
    cluster_data[col] = le.transform(cluster_data[col])
inertia = []  # a separate name, so the model-metrics `scores` DataFrame is not overwritten
for k in range(2,10):
    km = KMeans(n_clusters=k, random_state=123)
    km = km.fit(cluster_data)
    inertia.append(km.inertia_)
dfk = pd.DataFrame({'Cluster': range(2,10), 'Score': inertia})
plt.figure(figsize=(8,5))
plt.plot(dfk['Cluster'], dfk['Score'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
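The score plotted here is the k-means inertia, i.e. the sum of squared distances from each point to its assigned centroid; it decreases monotonically as k grows, so the "elbow" where the curve flattens is taken as a sensible number of clusters.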
for i in range(4,10):
    kmeans_labels = KMeans(n_clusters=i, random_state=123).fit_predict(cluster_data)
    print("Silhouette score for {} clusters k-means : {} ".format(i, metrics.silhouette_score(cluster_data, kmeans_labels, metric='euclidean').round(3)))
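The silhouette of a point i is s(i) = (b(i) − a(i)) / max(a(i), b(i)), where a(i) is its mean distance to points in its own cluster and b(i) its mean distance to the nearest other cluster; values near 1 indicate well-separated clusters, values near 0 overlapping ones.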
kmeans_labels=KMeans(n_clusters=5,random_state=123).fit_predict(cluster_data)
pca = PCA(n_components=3).fit_transform(cluster_data)
fig = plt.figure(figsize=(12, 7), dpi=80, facecolor='w', edgecolor='k')
ax = plt.axes(projection="3d")
ax.scatter3D(pca.T[0],pca.T[1],pca.T[2],c=kmeans_labels,cmap='Spectral')
xLabel = ax.set_xlabel('X')
yLabel = ax.set_ylabel('Y')
zLabel = ax.set_zlabel('Z')
cluster_data['Clusters']=list(kmeans_labels)
customers=pd.DataFrame(cluster_data['Clusters'].value_counts()).rename(columns={'Clusters':'Number of Customers'})
customers.T
cluster_data.columns
cluster_data.set_index('Clusters')
grouped_num=cluster_data.groupby(by='Clusters').mean().round(1)
grouped_num = grouped_num.loc[:,['EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes', 'totalOrders',
'OrderFrequency', 'numberOfDaysWithBusiness', 'makingBusiness',
'ordergap']]
grouped_num
cluster_data_exp = pd.get_dummies(data=cluster_data_1, columns=['City', 'OnlineCommunication','PreferredDeliveryDay','RetentionPotential'])
cluster_data_exp['Clusters'] = cluster_data['Clusters']
cluster_data_exp.columns
cluster_data_exp.set_index('Clusters')
grouped_cat = cluster_data_exp.groupby('Clusters').agg(RetentionPotential_High = ('RetentionPotential_High',np.count_nonzero ),
RetentionPotential_Low = ('RetentionPotential_Low',np.count_nonzero ),
RetentionPotential_Medium = ('RetentionPotential_Medium',np.count_nonzero ),
City_CITY1 = ('City_CITY1',np.count_nonzero ),
City_CITY2 = ('City_CITY2',np.count_nonzero ),
City_CITY3 = ('City_CITY3',np.count_nonzero ),
City_CITY4 = ('City_CITY4',np.count_nonzero ),
OnlineCommunication_0 = ('OnlineCommunication_0',np.count_nonzero ),
OnlineCommunication_1 = ('OnlineCommunication_1',np.count_nonzero ),
PreferredDeliveryDay_Friday = ('PreferredDeliveryDay_Friday',np.count_nonzero ),
PreferredDeliveryDay_Monday = ('PreferredDeliveryDay_Monday',np.count_nonzero ),
PreferredDeliveryDay_Saturday = ('PreferredDeliveryDay_Saturday',np.count_nonzero ),
PreferredDeliveryDay_Sunday = ('PreferredDeliveryDay_Sunday',np.count_nonzero ),
PreferredDeliveryDay_Thursday = ('PreferredDeliveryDay_Thursday',np.count_nonzero ),
PreferredDeliveryDay_Tuesday = ('PreferredDeliveryDay_Tuesday',np.count_nonzero ),
PreferredDeliveryDay_Wednesday = ('PreferredDeliveryDay_Wednesday',np.count_nonzero ),
)
grouped_cat
final_cluster = grouped_num.merge(grouped_cat,how='inner',on='Clusters')
final_cluster
len(final_cluster.columns)
features = final_cluster.columns
plt.figure(figsize=(10,40))
for i, j in enumerate(features):
    plt.subplot(13, 2, i+1)
    sns.barplot(x=final_cluster.index, y=final_cluster[j])  # keyword args; positional x/y were removed in recent seaborn
    plt.title(j, fontdict={'color': 'darkblue'})
plt.tight_layout()
plt.show()
# cluster_data_exp was overwritten by get_dummies above and no longer has a plain RetentionPotential column, so filter on cluster_data_1
high = cluster_data_1[cluster_data_1['RetentionPotential']=='High'].copy()
low = cluster_data_1[cluster_data_1['RetentionPotential']=='Low'].copy()
medium = cluster_data_1[cluster_data_1['RetentionPotential']=='Medium'].copy()
high.drop('RetentionPotential',axis=1,inplace=True)
low.drop('RetentionPotential',axis=1,inplace=True)
medium.drop('RetentionPotential',axis=1,inplace=True)
high_exp = high.copy()
low_exp = low.copy()
medium_exp = medium.copy()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_cols = ['City','PreferredDeliveryDay']
for col in cat_cols:
    high[col] = le.fit_transform(high[col])
    low[col] = le.fit_transform(low[col])
    medium[col] = le.fit_transform(medium[col])
for i in [high, low, medium]:
    inertia = []  # again, avoid clobbering the `scores` metrics DataFrame
    for k in range(2,10):
        km = KMeans(n_clusters=k, random_state=123)
        km = km.fit(i)
        inertia.append(km.inertia_)
    dfk = pd.DataFrame({'Cluster': range(2,10), 'Score': inertia})
    plt.figure(figsize=(8,5))
    plt.plot(dfk['Cluster'], dfk['Score'], marker='o')
    plt.title('ElbowCurve')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.show()
**Elbow curves** for the High, Low and Medium segments (plotted in that order above)
for i in range(3,10):
    kmeans_labels = KMeans(n_clusters=i, random_state=123).fit_predict(high)
    print("Silhouette score for {} clusters k-means : {} ".format(i, metrics.silhouette_score(high, kmeans_labels, metric='euclidean').round(3)))
for i in range(3,10):
    kmeans_labels = KMeans(n_clusters=i, random_state=123).fit_predict(low)
    print("Silhouette score for {} clusters k-means : {} ".format(i, metrics.silhouette_score(low, kmeans_labels, metric='euclidean').round(3)))
for i in range(3,10):
    kmeans_labels = KMeans(n_clusters=i, random_state=123).fit_predict(medium)
    print("Silhouette score for {} clusters k-means : {} ".format(i, metrics.silhouette_score(medium, kmeans_labels, metric='euclidean').round(3)))
kmeans_labels_high = KMeans(n_clusters=3,random_state=123).fit_predict(high)
kmeans_labels_low = KMeans(n_clusters=6,random_state=123).fit_predict(low)
kmeans_labels_medium = KMeans(n_clusters=3,random_state=123).fit_predict(medium)
kmeans_labels_high
pca = PCA(n_components=3).fit_transform(high)
fig = plt.figure(figsize=(12, 7), dpi=80, facecolor='w', edgecolor='k')
ax = plt.axes(projection="3d")
ax.scatter3D(pca.T[0],pca.T[1],pca.T[2],c=kmeans_labels_high,cmap='Spectral')
xLabel = ax.set_xlabel('X')
yLabel = ax.set_ylabel('Y')
zLabel = ax.set_zlabel('Z')
pca = PCA(n_components=3).fit_transform(low)
fig = plt.figure(figsize=(12, 7), dpi=80, facecolor='w', edgecolor='k')
ax = plt.axes(projection="3d")
ax.scatter3D(pca.T[0],pca.T[1],pca.T[2],c=kmeans_labels_low,cmap='Spectral')
xLabel = ax.set_xlabel('X')
yLabel = ax.set_ylabel('Y')
zLabel = ax.set_zlabel('Z')
pca = PCA(n_components=3).fit_transform(medium)
fig = plt.figure(figsize=(12, 7), dpi=80, facecolor='w', edgecolor='k')
ax = plt.axes(projection="3d")
ax.scatter3D(pca.T[0],pca.T[1],pca.T[2],c=kmeans_labels_medium,cmap='Spectral')
xLabel = ax.set_xlabel('X')
yLabel = ax.set_ylabel('Y')
zLabel = ax.set_zlabel('Z')
high['Clusters']=list(kmeans_labels_high)
(pd.DataFrame(high['Clusters'].value_counts()).rename(columns={'Clusters':'Number of Customers'})).T
low['Clusters']=list(kmeans_labels_low)
(pd.DataFrame(low['Clusters'].value_counts()).rename(columns={'Clusters':'Number of Customers'})).T
medium['Clusters']=list(kmeans_labels_medium)
(pd.DataFrame(medium['Clusters'].value_counts()).rename(columns={'Clusters':'Number of Customers'})).T
high.set_index('Clusters')
grouped_num_high=high.groupby(by='Clusters').mean().round(1)
grouped_num_high = grouped_num_high.loc[:,['EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes', 'totalOrders',
'OrderFrequency', 'numberOfDaysWithBusiness', 'makingBusiness',
'ordergap']]
grouped_num_high
low.set_index('Clusters')
grouped_num_low=low.groupby(by='Clusters').mean().round(1)
grouped_num_low = grouped_num_low.loc[:,['EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes', 'totalOrders',
'OrderFrequency', 'numberOfDaysWithBusiness', 'makingBusiness',
'ordergap']]
medium.set_index('Clusters')
grouped_num_medium=medium.groupby(by='Clusters').mean().round(1)
grouped_num_medium = grouped_num_medium.loc[:,['EmailType_CustomizedEmail', 'MailOpened_no',
'MailOpened_yes', 'MailClicked_no', 'MailClicked_yes', 'totalOrders',
'OrderFrequency', 'numberOfDaysWithBusiness', 'makingBusiness',
'ordergap']]
cluster_data_high = pd.get_dummies(data=high_exp, columns=['City', 'OnlineCommunication','PreferredDeliveryDay'])
cluster_data_low = pd.get_dummies(data=low_exp, columns=['City', 'OnlineCommunication','PreferredDeliveryDay'])
cluster_data_medium = pd.get_dummies(data=medium_exp, columns=['City', 'OnlineCommunication','PreferredDeliveryDay'])
cluster_data_high['Clusters'] = high['Clusters']
cluster_data_low['Clusters'] = low['Clusters']
cluster_data_medium['Clusters'] = medium['Clusters']
cluster_data_high.set_index('Clusters')
grouped_cat_high = cluster_data_high.groupby('Clusters').agg(
City_CITY1 = ('City_CITY1',np.count_nonzero ),
City_CITY2 = ('City_CITY2',np.count_nonzero ),
City_CITY3 = ('City_CITY3',np.count_nonzero ),
City_CITY4 = ('City_CITY4',np.count_nonzero ),
OnlineCommunication_0 = ('OnlineCommunication_0',np.count_nonzero ),
OnlineCommunication_1 = ('OnlineCommunication_1',np.count_nonzero ),
PreferredDeliveryDay_Friday = ('PreferredDeliveryDay_Friday',np.count_nonzero ),
PreferredDeliveryDay_Monday = ('PreferredDeliveryDay_Monday',np.count_nonzero ),
PreferredDeliveryDay_Saturday = ('PreferredDeliveryDay_Saturday',np.count_nonzero ),
PreferredDeliveryDay_Sunday = ('PreferredDeliveryDay_Sunday',np.count_nonzero ),
PreferredDeliveryDay_Thursday = ('PreferredDeliveryDay_Thursday',np.count_nonzero ),
PreferredDeliveryDay_Tuesday = ('PreferredDeliveryDay_Tuesday',np.count_nonzero ),
PreferredDeliveryDay_Wednesday = ('PreferredDeliveryDay_Wednesday',np.count_nonzero ),
)
cluster_data_low.set_index('Clusters')
grouped_cat_low = cluster_data_low.groupby('Clusters').agg(
City_CITY1 = ('City_CITY1',np.count_nonzero ),
City_CITY2 = ('City_CITY2',np.count_nonzero ),
City_CITY3 = ('City_CITY3',np.count_nonzero ),
City_CITY4 = ('City_CITY4',np.count_nonzero ),
OnlineCommunication_0 = ('OnlineCommunication_0',np.count_nonzero ),
OnlineCommunication_1 = ('OnlineCommunication_1',np.count_nonzero ),
PreferredDeliveryDay_Friday = ('PreferredDeliveryDay_Friday',np.count_nonzero ),
PreferredDeliveryDay_Monday = ('PreferredDeliveryDay_Monday',np.count_nonzero ),
PreferredDeliveryDay_Saturday = ('PreferredDeliveryDay_Saturday',np.count_nonzero ),
PreferredDeliveryDay_Sunday = ('PreferredDeliveryDay_Sunday',np.count_nonzero ),
PreferredDeliveryDay_Thursday = ('PreferredDeliveryDay_Thursday',np.count_nonzero ),
PreferredDeliveryDay_Tuesday = ('PreferredDeliveryDay_Tuesday',np.count_nonzero ),
PreferredDeliveryDay_Wednesday = ('PreferredDeliveryDay_Wednesday',np.count_nonzero ),
)
cluster_data_medium.set_index('Clusters')
grouped_cat_medium = cluster_data_medium.groupby('Clusters').agg(
City_CITY1 = ('City_CITY1',np.count_nonzero ),
City_CITY2 = ('City_CITY2',np.count_nonzero ),
City_CITY3 = ('City_CITY3',np.count_nonzero ),
City_CITY4 = ('City_CITY4',np.count_nonzero ),
OnlineCommunication_0 = ('OnlineCommunication_0',np.count_nonzero ),
OnlineCommunication_1 = ('OnlineCommunication_1',np.count_nonzero ),
PreferredDeliveryDay_Friday = ('PreferredDeliveryDay_Friday',np.count_nonzero ),
PreferredDeliveryDay_Monday = ('PreferredDeliveryDay_Monday',np.count_nonzero ),
PreferredDeliveryDay_Saturday = ('PreferredDeliveryDay_Saturday',np.count_nonzero ),
PreferredDeliveryDay_Sunday = ('PreferredDeliveryDay_Sunday',np.count_nonzero ),
PreferredDeliveryDay_Thursday = ('PreferredDeliveryDay_Thursday',np.count_nonzero ),
PreferredDeliveryDay_Tuesday = ('PreferredDeliveryDay_Tuesday',np.count_nonzero ),
PreferredDeliveryDay_Wednesday = ('PreferredDeliveryDay_Wednesday',np.count_nonzero ),
)
final_cluster_high = grouped_num_high.merge(grouped_cat_high,how='inner',on='Clusters')
final_cluster_low = grouped_num_low.merge(grouped_cat_low,how='inner',on='Clusters')
final_cluster_medium = grouped_num_medium.merge(grouped_cat_medium,how='inner',on='Clusters')
features_high = final_cluster_high.columns
features_low = final_cluster_low.columns
features_medium = final_cluster_medium.columns
plt.figure(figsize=(10,30))
for i, j in enumerate(features_high):
    plt.subplot(12, 2, i+1)
    sns.barplot(x=final_cluster_high.index, y=final_cluster_high[j])
    plt.title(j, fontdict={'color': 'darkblue'})
plt.tight_layout()
plt.show()
plt.figure(figsize=(10,30))
for i, j in enumerate(features_low):
    plt.subplot(12, 2, i+1)
    sns.barplot(x=final_cluster_low.index, y=final_cluster_low[j])
    plt.title(j, fontdict={'color': 'darkblue'})
plt.tight_layout()
plt.show()
Cluster 0: holds the largest number of customers; they are spread across all four cities, both OnlineCommunication values, and every preferred delivery day (Monday through Sunday).
Cluster 1: dominates the email-related features (customized emails, mail opened yes/no, mail clicked yes/no), shows moderate counts for total orders, order frequency, numberOfDaysWithBusiness, makingBusiness and ordergap, and is essentially absent from the remaining features.
Cluster 2: dominates totalOrders, OrderFrequency, makingBusiness and ordergap, with smaller contributions to EmailType_CustomizedEmail, MailOpened_no/yes, MailClicked_no and numberOfDaysWithBusiness, and almost none elsewhere.
This concludes the profile of the Low-retention clusters.
plt.figure(figsize=(10,30))
for i, j in enumerate(features_medium):
    plt.subplot(12, 2, i+1)
    sns.barplot(x=final_cluster_medium.index, y=final_cluster_medium[j])
    plt.title(j, fontdict={'color': 'darkblue'})
plt.tight_layout()
plt.show()
Cluster 0: customers here contribute to nearly every feature and dominate most of them.
Cluster 1: customers here concentrate in a handful of features: EmailType_CustomizedEmail, MailOpened_no/yes, MailClicked_no/yes, totalOrders, OrderFrequency, numberOfDaysWithBusiness, makingBusiness and ordergap.
Cluster 2: a small group whose customers appear mainly in totalOrders, OrderFrequency, makingBusiness and ordergap, with only a minor presence in the other features.
This concludes the profile of the Medium-retention clusters.
We should concentrate our strongest efforts on the medium retention-potential customers, so that sustained engagement moves them into the high-potential category.
For customers who already show high potential, we should keep them satisfied by meeting their needs promptly and without complacency.